Searching for functional elements in the intergenic sequence between the orthologous gene pairs across all species


1. Identifying conservation of RNA secondary structures in the intergenic sequences (igsq)

Using the Vienna RNA package [website link](http://www.tbi.univie.ac.at/RNA/). I downloaded the package (although it comes with an easy Ubuntu install, I did not use it; instead, I built the package so as to have the Python wrapper). After importing RNA, you can call the function 'fold' to calculate the best secondary structure.

In [26]:
# Testing the vienna rnafold package
import os
import sys
import re

sys.path.append('/home/jaggu/research/downloads/vienna/viennaRNA/lib/python2.7/site-packages')
import RNA

dotNot = RNA.fold('CCCGGCGTGGG')[0]
print dotNot
re.findall("\(+",dotNot)
_rep = re.sub("\(+","S",dotNot)
_rep = re.sub("\)+","S",_rep)
slNotation = re.sub("\.+","L",_rep)


(((.....)))
Out[26]:
'SLS'

In [1]:
import os
import sys
import cPickle as pickle
import time

def loadPkl(fname, pklDir='/home/jaggu/research/projectFiles/operons/pklFiles'):
    """Load and return the object pickled in file `fname` inside `pklDir`.

    Parameters
    ----------
    fname : str
        Name of the pickle file, relative to `pklDir`.
    pklDir : str, optional
        Directory holding the pickle files. Defaults to the project's
        pklFiles directory, so existing one-argument calls are unchanged.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    IOError / OSError if the file cannot be opened.
    """
    f = os.path.join(pklDir,fname)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Binary mode is what pickle expects, and the `with` block guarantees the
    # handle is closed even if unpickling fails.
    with open(f,'rb') as fh:
        db = pickle.load(fh)
    return db

def savePkl(db,pklFname, pklDir='/home/jaggu/research/projectFiles/operons/pklFiles'):
    """Pickle `db` into file `pklFname` inside `pklDir`.

    Parameters
    ----------
    db : object
        Any picklable object.
    pklFname : str
        Name of the output file, relative to `pklDir`. An existing file is
        overwritten.
    pklDir : str, optional
        Output directory. Defaults to the project's pklFiles directory, so
        existing two-argument calls are unchanged.

    Returns
    -------
    None

    Raises
    ------
    IOError / OSError if the file cannot be opened for writing.
    """
    f = os.path.join(pklDir,pklFname)
    # Binary mode is what pickle expects; the `with` block flushes and closes
    # the file so the data is fully written before returning.
    with open(f,'wb') as fh:
        pickle.dump(db,fh)
    return

In [ ]:
# Importing dictionaries: load the pickled lookup tables from the pklFiles
# directory (via loadPkl) into notebook-level names used by later cells.

# Iterated in a later cell as: org -> list of (locusTag1, locusTag2, intergenic DNA seq)
org_lTagPairIGSeq_dict = loadPkl('org_locusTagPairInterGeneSeq.dict.pkl')
# NOTE(review): presumably maps locus tag -> COG; not used in the cells visible here - confirm
locus_cog_dict = loadPkl('locus_cog.dict.pkl')

# Timestamped progress message
print "Org_lTagPairIGSeq_dict loaded",time.ctime()

In [6]:
# Parsing each IG seq as an RNA and, using RNAfold from Vienna, reading out an SL notation (Stem, Loop).
# Every COG pair then gets a list of these secondary structures. Finally, I parse the COG pairs to get the
# frequency of each secondary structure.
sys.path.append('/home/jaggu/research/downloads/vienna/viennaRNA/lib/python2.7/site-packages')
import RNA
import re

def getSStr(dnaSeq):
    """Fold `dnaSeq` with RNA.fold and return its stem/loop (SL) notation.

    Element [0] of the RNA.fold result (the dot-bracket string) is collapsed
    so that every run of '(' or of ')' becomes a single 'S' and every run of
    '.' becomes a single 'L'; e.g. '(((.....)))' -> 'SLS'.

    Parameters
    ----------
    dnaSeq : str
        Sequence passed unchanged to RNA.fold.

    Returns
    -------
    str
        The collapsed stem/loop string.
    """
    dotNot = RNA.fold(dnaSeq)[0]
    # Raw strings keep the regex backslashes from being read as string escapes.
    _rep = re.sub(r"\(+","S",dotNot)
    _rep = re.sub(r"\)+","S",_rep)
    slNotation = re.sub(r"\.+","L",_rep)
    return slNotation

for org,lTagIGSeq_list in org_lTagPairIGSeq_dict.items():
    print org, len(lTagIGSeq_list)
    for lTag1,lTag2,dnaSeq in lTagIGSeq_list:
        print lTag1,lTag2
        secStr = getSStr(dnaSeq)
        print dnaSeq, secStr
        sys.exit(1)
        break;


('Escherichia_coli__BL21_Gold_DE3_pLysS_AG__uid59245', 'NC_012947') 4228
ECBD_0001 ECBD_0002
ACCTA L
An exception has occurred, use %tb to see the full traceback.

SystemExit: 1
To exit: use 'exit', 'quit', or Ctrl-D.

In [ ]: